Data analysis 1

Modeling

library("tidyverse")
library("broom")
library("ggrepel")
library("here")
augmented_table <- read_tsv("../data/03_augmented_table.tsv")
Rows: 2570560 Columns: 17
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr (10): GeneFeature, Sample_names, Patient, IDH_status, Localisation, Prim...
dbl  (7): Quantile, CPM, GEP_score, Age, Batch, TLS_status_bin, CPM_log2

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
augmented_metadata <- read_tsv("../data/03_augmented_metadata.tsv")
Rows: 64 Columns: 13
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr (9): Sample_names, Patient, IDH_status, Localisation, Primary_or_Recurre...
dbl (4): GEP_score, Age, Batch, TLS_status_bin

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#MODEL 1: CPM log2 transformed ~ TLS_status
nested_data <- augmented_table |>
  group_by(GeneFeature) |>
  nest()  |>
  mutate(model = map(data, 
                     ~lm(CPM_log2 ~ TLS_status_bin, 
                         data = .x)),
         tidy_model = map(model, 
                          ~tidy(.x, conf.int = TRUE)))
gene_results <- nested_data |>
  unnest(tidy_model) |>
  filter(term == "TLS_status_bin") |>
  select(GeneFeature, 
         estimate, 
         p.value, 
         conf.low, 
         conf.high) |>
  mutate(q.value = p.adjust(p.value, 
                            method = "fdr"),
         significant = q.value < 0.01)
#MODEL 2: GEP_score ~ TLS_status
gep_model <- lm(GEP_score ~ TLS_status_bin, 
                data = augmented_metadata)
gep_model_tidy <- tidy(gep_model, 
                       conf.int = TRUE)
#Volcano plot for all genes
volcano_plot <- gene_results |>
  drop_na(estimate, 
          p.value, 
          significant, 
          GeneFeature) |>
  mutate(neg_log10_p = -log10(p.value),
         label = if_else(significant, 
                         GeneFeature, "")) |>
  ggplot(aes(x = estimate, 
             y = neg_log10_p, 
             color = significant)) +
  geom_point(alpha = 0.6) +
  geom_text_repel(aes(label = label), 
                  size = 2, 
                  max.overlaps = 20) +
  geom_hline(yintercept = 0, 
             linetype = "dotted") +
  labs(title = "Volcano Plot of Gene Effects (CPM ~ TLS_status)",
       x = "Effect size (Estimate)",
       y = expression(-log[10](p.value)),
       color = "Significant" ) +
  theme_minimal()
#Print volcano plot
volcano_plot
Warning: ggrepel: 1065 unlabeled data points (too many overlaps). Consider
increasing max.overlaps

ggsave(
  filename = "volcano_plot.png",
  path = here("results"),
  dpi = 300,
  bg = "white"
)
Saving 7 x 5 in image
Warning: ggrepel: 1068 unlabeled data points (too many overlaps). Consider
increasing max.overlaps
#Extracting p value from model 2
tls_p <- gep_model_tidy |>
  filter(term == "TLS_status_bin") |>  
  pull(p.value)
#Plotting GEP score vs TLS status
GEP_score_vs_TLS_status <- ggplot(
  augmented_metadata, 
  aes(x = factor(TLS_status_bin), 
      y = GEP_score, 
      fill = factor(TLS_status_bin))) +
  geom_boxplot(width = 0.5, 
               alpha = 0.8, 
               color = "black") +
  scale_x_discrete(labels = c("0" = "No", "1" = "Yes")) +
  geom_jitter(
    aes(color = factor(TLS_status_bin)),
    width = 0.07,
    alpha = 0.6,
    size = 2
  ) +
  scale_fill_manual(values = c("0" = "lightsteelblue2", 
                               "1" = "indianred3")) +
  scale_color_manual(values = c("0" = "lightsteelblue4", 
                                "1" = "indianred4")) +
  labs(
    title = paste0(
      "GEP Score by TLS Status (p value = ", signif(tls_p, 3), ")"
    ),
    x = "TLS Status",
    y = "GEP Score") +
  theme_classic(base_size = 15) +
  theme(legend.position = "none",
        plot.title = element_text(face = "bold", 
                                  hjust = 0.5),
        axis.title = element_text(face = "bold"))

GEP_score_vs_TLS_status

ggsave(filename = "GEP_score_vs_TLS_status.png",
       path = here("results"),
       dpi = 300)
Saving 7 x 5 in image
#Checking if genes found significant by TLS status overlap with the 18 signature GEP genes
signature_genes <- c(
  "CCL5", "CD27", "CD274", "CD276", "CD8A", "CMKLR1", "CXCL9",
  "CXCR6", "HLA-DQA1", "HLA-DRB1", "HLA-E", "IDO1", "LAG3",
  "NKG7", "PDCD1LG2", "PSMB10", "STAT1", "TIGIT"
)

tls_sig_genes <- gene_results |>
  filter(significant) |>
  pull(GeneFeature)

overlap_genes <- intersect(signature_genes, 
                           tls_sig_genes)

overlap_genes
[1] "CD27"  "CXCR6"
length(overlap_genes)
[1] 2
length(signature_genes)
[1] 18
overlap_table <- tibble(Gene = signature_genes,
                        In_TLS_Significant = signature_genes %in% tls_sig_genes
)

overlap_table
# A tibble: 18 × 2
   Gene     In_TLS_Significant
   <chr>    <lgl>             
 1 CCL5     FALSE             
 2 CD27     TRUE              
 3 CD274    FALSE             
 4 CD276    FALSE             
 5 CD8A     FALSE             
 6 CMKLR1   FALSE             
 7 CXCL9    FALSE             
 8 CXCR6    TRUE              
 9 HLA-DQA1 FALSE             
10 HLA-DRB1 FALSE             
11 HLA-E    FALSE             
12 IDO1     FALSE             
13 LAG3     FALSE             
14 NKG7     FALSE             
15 PDCD1LG2 FALSE             
16 PSMB10   FALSE             
17 STAT1    FALSE             
18 TIGIT    FALSE             
#Plotting overlapping genes
siginifcant_genes_overlap <- ggplot(overlap_table, aes(
  x = In_TLS_Significant,
  y = Gene,
  color = In_TLS_Significant
)) +
  geom_point(size = 4) +
  scale_x_discrete(labels = c("FALSE" = "No", "TRUE" = "Yes")) +
  scale_color_manual(values = c("FALSE" = "lightsteelblue4", "TRUE" = "indianred3")) +
  labs(title = "Which Signature Genes Are TLS-Significant?",
       x = "",
       y = "Gene") +
  theme_minimal(base_size = 13) +
  theme(
    legend.position = "none",
    panel.grid.major.y = element_line(color = "grey70", linewidth = 0.5),
    panel.grid.major.x = element_line(color = "grey70", linewidth = 0.5),
    panel.grid.minor.y = element_line(color = "grey70", linewidth = 0.3),
    panel.grid.minor.x = element_line(color = "grey70", linewidth = 0.3),
  )

siginifcant_genes_overlap

ggsave(
  filename = "significant_genes_overlap.png",
  path = here("results"),
  dpi = 300,
  bg = "white"
)
Saving 7 x 5 in image